# Replication Code: Can (Thin) Populism be manipulated without manipulating Host Ideology? Evidence from a conjoint validation approach
# Script 02: Data Cleaning, Recoding, and Analysis of US Wide Data

# Ensure packages from script_01 are loaded and Prolific_US_Raw.csv is in working directory


# Load Raw US Data

filename_US <- "Prolific_US_Raw.csv"

# Row 1 of the raw file holds the variable names.
headers_US <- read_csv(filename_US, col_names = FALSE, n_max = 1)

# The first three rows are skipped when reading the responses
# (rows 2-3 presumably hold descriptive/metadata headers -- confirm against
# the raw export).
data_wide_US <- read_csv(filename_US, skip = 3, col_names = FALSE)

# headers_US is a one-row tibble; flatten it to a plain character vector
# before using it as column names (assigning a non-character object as tibble
# names relies on a deprecated implicit coercion).
colnames(data_wide_US) <- as.character(unlist(headers_US, use.names = FALSE))

# Overview of variables
names(data_wide_US)

## Data cleaning and examination

# Information on Survey Completion
table(data_wide_US$Finished)
table(data_wide_US$Finished, data_wide_US$Sample)

# Keep only completed responses.
# which() drops rows whose Finished value is NA; indexing with the raw logical
# vector would instead insert an all-NA row for every missing value.
data_wide_US <- data_wide_US[which(data_wide_US$Finished == 1), ]
table(data_wide_US$Finished)
table(data_wide_US$Sample)

# Information on Survey Duration
summary(data_wide_US$`Duration (in seconds)`)

# Mock vignette attention check: only answer code 4 is scored as attentive (1);
# codes 1-3 and 5-6 are scored 0.
table(data_wide_US[["attent_vignette_out"]])
data_wide_US <- data_wide_US %>%
  mutate(attentive = car::recode(attent_vignette_out, "1:3=0; 4=1; 5:6=0"))
table(data_wide_US[["attentive"]])
table(data_wide_US[["attentive"]], data_wide_US[["Sample"]])

# Basic manipulation check (partisan information), broken down by sample
table(data_wide_US[["partisanship_mention"]], data_wide_US[["Sample"]])


## Variable recodes 

# Sex: turn the raw codes into a factor and label its levels.
# NOTE(review): labels are assigned by position to the sorted observed codes,
# so this assumes every answer category occurs in the data -- confirm.
table(data_wide_US[["gender"]])
data_wide_US <- data_wide_US %>%
  mutate(gender = as.factor(gender))
levels(data_wide_US[["gender"]]) <- c("Male", "Female", "Other")

# Race: same approach as for sex
table(data_wide_US[["race"]])
data_wide_US <- data_wide_US %>%
  mutate(race = as.factor(race))
levels(data_wide_US[["race"]]) <- c("White", "Black", "Hispanic", "Asian", "Other")

# Age
table(data_wide_US[["yearborn"]])

# Entries above 1000 are treated as birth years and converted to age in 2024;
# smaller entries are kept unchanged (presumably respondents who typed their
# age instead of a year -- confirm).
data_wide_US <- data_wide_US %>%
  mutate(age = ifelse(yearborn > 1000, 2024 - yearborn, yearborn))
table(data_wide_US[["age"]])

# Bin age into five groups; ages outside 18-100 (or missing) become NA
data_wide_US <- data_wide_US %>%
  mutate(
    agegroup = case_when(
      dplyr::between(age, 18, 25) ~ "18-25",
      dplyr::between(age, 26, 35) ~ "26-35",
      dplyr::between(age, 36, 49) ~ "36-49",
      dplyr::between(age, 50, 64) ~ "50-64",
      dplyr::between(age, 65, 100) ~ "65-100"
    )
  )

# Education: factor with descriptive labels.
# NOTE(review): labels are matched by position to the sorted observed codes,
# which assumes every category is present in the data -- confirm.
table(data_wide_US[["educ"]])
data_wide_US <- data_wide_US %>%
  mutate(educ = as.factor(educ))
levels(data_wide_US[["educ"]]) <- c(
  "Some high school",
  "High school graduate",
  "Some college",
  "College graduate",
  "Some graduate school",
  "Graduate school degree"
)

# Household income: same approach as for education
table(data_wide_US[["income"]])
data_wide_US <- data_wide_US %>%
  mutate(income = as.factor(income))
levels(data_wide_US[["income"]]) <- c(
  "Under $20,000",
  "$20,001 – $50,000",
  "$50,001 – $100,000",
  "$100,001 – $150,000",
  "Above $150,000",
  "Prefer not to answer"
)

# Partisanship

# Cross-tabulate the party stem item against each follow-up item
table(data_wide_US[["pid0"]], data_wide_US[["pid_Ind"]])
table(data_wide_US[["pid0"]], data_wide_US[["pid_leanD"]])
table(data_wide_US[["pid0"]], data_wide_US[["pid_leanR"]])

# Seven-point scale built from the stem (pid0) and its follow-ups.
# Respondents matching none of the combinations below are left as NA.
data_wide_US <- data_wide_US %>%
  mutate(
    pid7 = case_when(
      pid0 == 2 & pid_leanD == 1 ~ 1,
      pid0 == 2 & pid_leanD == 2 ~ 2,
      pid0 == 3 & pid_Ind == 2 ~ 3,
      pid0 == 3 & pid_Ind == 3 ~ 4,
      pid0 == 3 & pid_Ind == 1 ~ 5,
      pid0 == 1 & pid_leanR == 2 ~ 6,
      pid0 == 1 & pid_leanR == 1 ~ 7
    )
  )
table(data_wide_US[["pid7"]])

# Collapse to three categories: 1-3 -> 1, 4 -> 2, 5-7 -> 3
data_wide_US <- data_wide_US %>%
  mutate(pid3 = car::recode(pid7, "1:3=1; 4=2; 5:7=3"))
table(data_wide_US[["pid3"]])

# Labelled factor version of the three-category measure
data_wide_US <- data_wide_US %>%
  mutate(`Respondent Partisanship` = as.factor(pid3))
levels(data_wide_US[["Respondent Partisanship"]]) <- c(
  "Democrat",
  "Independent",
  "Republican"
)

# Dataset for merging: respondent ID plus partisanship, restricted to
# Democrats and Republicans, written to partisanship_df_US.csv
partisanship_df_US <- data.frame(
  Response.ID = data_wide_US$ResponseId,
  PID = data_wide_US$`Respondent Partisanship`
) %>%
  dplyr::filter(PID %in% c("Democrat", "Republican"))
write.csv(partisanship_df_US, "partisanship_df_US.csv", row.names = FALSE)


# Populist Attitudes (See Appendix K for details)

# Goal of the recodes below: higher values = more populist

# Raw response distributions of the nine items
table(data_wide_US[["ppl1"]])
table(data_wide_US[["ppl2"]]) # reverse coded
table(data_wide_US[["ppl3"]])
table(data_wide_US[["ant1"]])
table(data_wide_US[["ant2"]]) # reverse coded
table(data_wide_US[["ant3"]])
table(data_wide_US[["man1"]])
table(data_wide_US[["man2"]]) # reverse coded
table(data_wide_US[["man3"]])

# Flip the 1-5 scale for every item except the reverse-coded ones
# (ppl2, ant2, man2), which are copied over unchanged.
flip_5pt <- "1=5; 2=4; 3=3; 4=2; 5=1"
data_wide_US <- data_wide_US %>%
  mutate(
    Pop1 = car::recode(ppl1, flip_5pt),
    Pop2 = ppl2,
    Pop3 = car::recode(ppl3, flip_5pt),
    Pop4 = car::recode(ant1, flip_5pt),
    Pop5 = ant2,
    Pop6 = car::recode(ant3, flip_5pt),
    Pop7 = car::recode(man1, flip_5pt),
    Pop8 = man2,
    Pop9 = car::recode(man3, flip_5pt)
  )

# Prepare Data for Factor Analysis: one row per respondent, all nine items
thin_pop_US <- data_wide_US %>%
  select(
    "ResponseId", "Pop1", "Pop2", "Pop3", "Pop4", "Pop5",
    "Pop6", "Pop7", "Pop8", "Pop9"
  ) %>%
  distinct(ResponseId, .keep_all = TRUE)


# Run Initial EFA model (single factor, polychoric correlations)
fa_thin_pop_US <- fa(thin_pop_US[, 2:10], nfactors = 1, scores = "regression", cor = "poly")

# Examine factor loadings
fa_thin_pop_US$loadings

# Revised EFA model (threshold 0.4; See Appendix K)
# Prepare Data for Factor Analysis: keep only Pop1, Pop3, Pop4, Pop5 and Pop6
thin_pop_US <- data_wide_US %>%
  select("ResponseId", "Pop1", "Pop3", "Pop4", "Pop5", "Pop6") %>%
  distinct(ResponseId, .keep_all = TRUE)

# Run Revised EFA model
fa_thin_pop_US_threshold <- fa(thin_pop_US[, 2:6], nfactors = 1, scores = "regression", cor = "poly")

# Save factor scores.
# tibble() replaces the deprecated data_frame(). The single-factor scores are
# flattened with as.numeric() so that `scores` is stored as an ordinary
# numeric column rather than a one-column matrix nested inside the tibble.
fa_scores_threshold_US <- tibble(
  ResponseId = thin_pop_US$ResponseId,
  scores = as.numeric(fa_thin_pop_US_threshold$scores)
)

# Append factor scores
data_wide_US <- left_join(data_wide_US, fa_scores_threshold_US, by = "ResponseId")

# Create Populist Attitudes Mean Split
# The cut point is computed from the data instead of being pasted in as a
# literal, so it stays correct if the sample or the factor model changes.
# as.numeric() makes this work whether `scores` is stored as a plain vector or
# as a one-column matrix; na.rm = TRUE keeps the mean defined when some
# respondents have no factor score.
populism_cutoff <- mean(as.numeric(data_wide_US$scores), na.rm = TRUE)
populism_cutoff

# Respondents at or above the mean are "Populist", those below "Non-populist";
# missing scores stay NA.
data_wide_US <- mutate(
  data_wide_US,
  populist_factor = as.factor(case_when(
    as.numeric(scores) >= populism_cutoff ~ "Populist",
    as.numeric(scores) < populism_cutoff ~ "Non-populist"
  ))
)

# Dataset for merging: respondent ID plus the populist/non-populist split,
# written to populism_df_US.csv
populism_df_US <- data.frame(
  Response.ID = data_wide_US$ResponseId,
  Populism_Factor = data_wide_US$populist_factor
)
write.csv(populism_df_US, "populism_df_US.csv", row.names = FALSE)


## Prepare Demographic Tables for Appendix A Tables

# Split respondents by sample
data_wide_US_s1 <- data_wide_US[data_wide_US[["Sample"]] == "ONE", ]
data_wide_US_s2 <- data_wide_US[data_wide_US[["Sample"]] == "TWO", ]

# Sample 1 Demographics (columns are picked and given display names in one step)
demographics_s1 <- data_wide_US_s1 %>%
  select(
    Sex = gender,
    `Race or Ethnicity` = race,
    Age = agegroup,
    Education = educ,
    `Household Income` = income,
    Partisanship = `Respondent Partisanship`
  )

# Table is created in script_06 or by "uncommenting" line below
# datasummary_skim(demographics_s1, output = 'tableA1.html')

# Sample 2 Demographics (same columns and display names as Sample 1)
demographics_s2 <- data_wide_US_s2 %>%
  select(
    Sex = gender,
    `Race or Ethnicity` = race,
    Age = agegroup,
    Education = educ,
    `Household Income` = income,
    Partisanship = `Respondent Partisanship`
  )

# Table is created in script_06 or by "uncommenting" line below
# datasummary_skim(demographics_s2, output = 'tableA2.html')





